/* * This file is part of the URI Template library. * * For licensing information please see the file license.txt included in the release. * A copy of this licence can also be found at * http://www.opensource.org/licenses/artistic-license-2.0.php */ package org.weborganic.furi; import java.nio.ByteBuffer; import java.nio.charset.Charset; import java.text.Normalizer; import java.text.Normalizer.Form; /** * An encoder/decoder for use by URI templates. * * Only unreserved characters according to RFC 3986 do not need to be encoded within a variable: * * <pre> * unreserved = ALPHA / DIGIT / '-' / '.' / '_' / '˜'; * </pre> * * <p> * This encoder/decoder should be designed so that URI which contain only unreserved characters are * processed faster. * * @see <a href="http://tools.ietf.org/html/rfc3986">RFC 3986 - Uniform Resource Identifier (URI): * Generic Syntax<a/> * @see <a href="http://tools.ietf.org/html/rfc3986#appendix-A">RFC 3986 - Uniform Resource * Identifier (URI): Generic Syntax - Appendix A. Collected ABNF for URI</a> * @see <a href="http://www.unicode.org/unicode/reports/tr15/tr15-23.html#Specification">UAX #15: * Unicode Normalization</a> * * @author Christophe Lauret * @version 11 June 2009 */ public class URICoder { /** * The UTF8 character set for reuse - Always defined. */ private final static Charset UTF8 = Charset.forName("UTF-8"); /** * The hexadecimal digits for use by the encoder. */ private final static char[] HEX_DIGITS = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'}; /** * Prevents creation of instances. */ private URICoder() {} // Encoder // ========================================================================== /** * Encodes the string as valid URI fragment. * * <p> * This encoder will encode all but unreserved characters using the escape sequence. * * @param s The string to encode. * * @return The corresponding encoded string. */ public static String encode(String s) { // invoke encode method with character that we know does not require encoding return encode(s, '0'); } /** * Encodes the string as valid URI fragment. * * <p> * This encoder will percent-encode all but <em>unreserved</em> characters. * * @param s The string to encode. * @param c An ASCII character that should not be encoded if found in the string. * * @return The corresponding encoded string. */ public static String encode(String s, char c) { if (s.length() == 0) { return s; } // Check whether we need to use UTF-8 encoder boolean ascii = isASCII(s); return ascii ? encode_ASCII(s, c) : encode_UTF8(s, c); } /** * Encodes the string as valid URI fragment. * * <p> * This encoder will percent-encode all but <em>illegal</em> characters. * * @param s The string to encode. * * @return The corresponding encoded string. */ public static String minimalEncode(String s) { if (s.length() == 0) { return s; } // Check whether we need to use UTF-8 encoder boolean ascii = isASCII(s); return ascii ? minimalEncode_ASCII(s) : minimalEncode_UTF8(s); } /** * Encodes a string containing only ASCII characters. * * @param s The string the encode (assuming ASCII characters only) * @param e A character that does not require encoding if found in the string. */ private static String encode_ASCII(String s, char e) { StringBuffer sb = new StringBuffer(); for (char c : s.toCharArray()) { if (isUnreserved((int) c) || c == e) { sb.append(c); } else { appendEscape(sb, c); } } return sb.toString(); } /** * Encodes a string containing only ASCII characters. * * @param s The string the encode (assuming ASCII characters only) */ private static String minimalEncode_ASCII(String s) { StringBuffer sb = new StringBuffer(); for (char c : s.toCharArray()) { if (isLegal((int) c)) { sb.append(c); } else { appendEscape(sb, c); } } return sb.toString(); } /** * Encodes a string containing non ASCII characters using an UTF-8 encoder. * * @param s The string the encode (assuming ASCII characters only) * @param e A character that does not require encoding if found in the string. */ private static String encode_UTF8(String s, char e) { // TODO: Normalizer requires Java 6! String n = (Normalizer.isNormalized(s, Form.NFKC)) ? s : Normalizer.normalize(s, Form.NFKC); // convert String to UTF-8 ByteBuffer bb = UTF8.encode(n); // URI encode StringBuffer sb = new StringBuffer(); while (bb.hasRemaining()) { int b = bb.get() & 0xff; if (isUnreserved(b) || b == e) { sb.append((char) b); } else { appendEscape(sb, (byte) b); } } return sb.toString(); } /** * Encodes a string containing non ASCII characters using an UTF-8 encoder. * * @param s The string the encode (assuming ASCII characters only) */ private static String minimalEncode_UTF8(String s) { // TODO: Normalizer requires Java 6! String n = (Normalizer.isNormalized(s, Form.NFKC)) ? s : Normalizer.normalize(s, Form.NFKC); // convert String to UTF-8 ByteBuffer bb = UTF8.encode(n); // URI encode StringBuffer sb = new StringBuffer(); while (bb.hasRemaining()) { int b = bb.get() & 0xff; if (isLegal(b)) { sb.append((char) b); } else { appendEscape(sb, (byte) b); } } return sb.toString(); } // Decoder // ========================================================================== /** * Decode the string as valid URI fragment. * * @param s The string to decode. * * @return The corresponding decoded string. */ public static String decode(String s) { if (s.length() == 0 || (s.indexOf('%') < 0 && s.indexOf('+') < 0)) { return s; } // Check whether we need to convert to UTF-8 encoder boolean ascii = isEncodedASCII(s); return ascii ? decode_ASCII(s) : decode_UTF8(s); } /** * Decodes a string containing only ASCII characters. */ private static String decode_ASCII(String s) { StringBuffer sb = new StringBuffer(); for (int i = 0; i < s.length(); i++) { char c = s.charAt(i); if (c == '%') { if (i < s.length() - 2) { String hex = String.copyValueOf(new char[] {s.charAt(++i), s.charAt(++i)}); char x = (char) Integer.parseInt(hex, 16); sb.append(x); } // TODO: handle error condition } else if (c == '+') { sb.append(' '); } else { sb.append(c); } } return sb.toString(); } /** * Decodes a string containing non ASCII characters using an UTF-8 decoder. */ private static String decode_UTF8(String s) { // URI decode ByteBuffer bb = ByteBuffer.allocate(s.length()); for (int i = 0; i < s.length(); i++) { char c = s.charAt(i); if (c == '%') { if (i < s.length() - 2) { String hex = "" + s.charAt(++i) + s.charAt(++i); byte b = (byte) (Integer.parseInt(hex, 16)); bb.put(b); } } else if (c == '+') { bb.put((byte) ' '); } else { // TODO: could there be also non-ASCII characters that should have been encoded? bb.put((byte) c); } } bb.limit(bb.position()); bb.position(0); return UTF8.decode(bb).toString(); } /** * Appends the escape sequence for the given byte to the specified string buffer. * * @param sb The string buffer. * @param b The byte to escape. */ private static void appendEscape(StringBuffer sb, byte b) { sb.append('%'); sb.append(HEX_DIGITS[(b >> 4) & 0x0f]); sb.append(HEX_DIGITS[(b >> 0) & 0x0f]); } /** * Appends the escape sequence for the given byte to the specified string buffer. * * @param sb The string buffer. * @param c The byte to escape. */ private static void appendEscape(StringBuffer sb, char c) { sb.append('%'); sb.append(HEX_DIGITS[(c >> 4) & 0x0f]); sb.append(HEX_DIGITS[(c >> 0) & 0x0f]); } /** * Indicates whether the character is unreserved of not. * * @param c The character to test. * * @return <code>true</code> if it is unreserved; <code>false</code> otherwise. */ private static boolean isUnreserved(int c) { // ALPHA (lower) if (c >= 'a' && c <= 'z') { return true; // ALPHA (UPPER) } else if (c >= 'A' && c <= 'Z') { return true; // DIGIT } else if (c >= '0' && c <= '9') { return true; } else if (c == '.' || c == '_' || c == '-' || c == '~') { return true; } return false; } /** * Indicates whether the character is unreserved of not. * * @param c The character to test. * * @return <code>true</code> if it is unreserved; <code>false</code> otherwise. */ private static boolean isLegal(int c) { // Filter out [<26] if (c < '&' && c != '!' && c != '#' && c != '$') { return false; // Filter out [>7A] } else if (c >= '{' && c != '~') { return false; // Handle [26-7A] and '!', '#', '$', '~' } else if (c == '`' || c == '<' || c == '>' || c == '\\' || c == '^') { return false; } return true; } /** * Indicates whether the string contains non-ASCII characters. */ private static boolean isASCII(String s) { for (int i = 0; i < s.length(); i++) { if (s.charAt(i) >= 0x80) { return false; } } return true; } /** * Indicates whether the encoded string contains non-ASCII characters. */ private static boolean isEncodedASCII(String s) { for (int i = 0; i < s.length(); i++) { if (s.charAt(i) == '%' && i < s.length() - 1 && s.charAt(i + 1) > '7') { return false; } } return true; } }